import pandas as pd
import numpy as np
import datetime as dt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pycountry
import re
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
from matplotlib.pyplot import figure
%matplotlib inline
# Load the post data from CSV.
post_df = pd.read_csv('data_posts.csv')

# `post_created_utc` holds Unix epoch seconds. Convert the whole column at
# once and keep the result in UTC (timezone-naive), so that `created_date`
# and `created_year` do not depend on the local timezone of the machine
# running the notebook (which is what `datetime.fromtimestamp` would use).
post_df['created_date'] = pd.to_datetime(post_df['post_created_utc'], unit='s')
post_df['created_year'] = post_df['created_date'].dt.year
post_df
| post_id | post_title | post_text | post_url | post_score | post_num_comments | post_upvote_ratio | post_created_utc | created_date | created_year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 75jx65 | My brother, Adeolu Ogunniyi (24) has been miss... | NaN | https://imgur.com/a/S4AlS | 34248 | 371 | 0.91 | 1.507667e+09 | 2017-10-11 03:30:36 | 2017 |
| 1 | 9qyu4g | How to shake someone at a hostel? | I’m travelling solo for the first time in Japa... | https://www.reddit.com/r/solotravel/comments/9... | 8296 | 138 | 0.95 | 1.540381e+09 | 2018-10-24 18:37:13 | 2018 |
| 2 | 5m30z6 | I told myself I would go solo travel through E... | NaN | https://i.reddituploads.com/11b13e9b9d2b417688... | 7178 | 198 | 0.92 | 1.483576e+09 | 2017-01-05 07:29:54 | 2017 |
| 3 | 16c1of1 | The number of old sex tourists in Bangkok is i... | I am currently in Bangkok and the number of se... | https://www.reddit.com/r/solotravel/comments/1... | 5436 | 624 | 0.82 | 1.694048e+09 | 2023-09-07 07:50:58 | 2023 |
| 4 | fj4v2p | For those of you still travelling Europe despi... | I've seen about a million threads on this toda... | https://www.reddit.com/r/solotravel/comments/f... | 4777 | 544 | 0.95 | 1.584293e+09 | 2020-03-16 00:25:20 | 2020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | o3kul3 | Is it a bad idea to quit a secure job to trave... | I've been at my job for quite a few years now,... | https://www.reddit.com/r/solotravel/comments/o... | 364 | 231 | 0.94 | 1.624123e+09 | 2021-06-20 00:14:40 | 2021 |
| 996 | mijvzb | How important is it to you to meet people whil... | I know everyone will have a different opinion ... | https://www.reddit.com/r/solotravel/comments/m... | 368 | 159 | 0.95 | 1.617368e+09 | 2021-04-02 20:01:11 | 2021 |
| 997 | lwta89 | Sites I check before travelling alone during t... | Hi guys. I’ve travelled a few times (during th... | https://www.reddit.com/r/solotravel/comments/l... | 368 | 110 | 0.87 | 1.614776e+09 | 2021-03-03 19:58:47 | 2021 |
| 998 | jbnzw8 | In which country you felt people were more fri... | Got some friends who were in Iran, and they ab... | https://www.reddit.com/r/solotravel/comments/j... | 360 | 538 | 0.95 | 1.602770e+09 | 2020-10-15 20:55:56 | 2020 |
| 999 | exmw4q | What's the number one thing you look for when ... | Personally... A curtain for the beds. Severely... | https://www.reddit.com/r/solotravel/comments/e... | 365 | 229 | 0.97 | 1.580641e+09 | 2020-02-02 18:00:01 | 2020 |
1000 rows × 10 columns
# Build one corpus string from all post titles. Titles are joined with a
# space so the last word of one title cannot fuse with the first word of the
# next one and produce bogus tokens in the word cloud.
text = " ".join(str(p) for p in post_df['post_title'])
#generate word cloud:
# Extra words removed from the cloud in addition to wordcloud's STOPWORDS.
# Every entry is followed by a comma: adjacent string literals without one
# are silently concatenated (e.g. "Hostel" "Update" -> "HostelUpdate").
EXTRA_STOPWORDS = (
    "https", "travel", "I", "Edit", "EDIT", "etc", "imgur", "html",
    "Hostel", "Update", "anyone", "want",
)


def generate_wordcloud(new_text):
    """Draw a word cloud for ``new_text`` and return the matplotlib figure.

    Parameters:
        new_text: the text whose words are rendered.

    Returns:
        The figure created with ``plt.figure`` that holds the rendered cloud.
    """
    # Create stopwords: the library defaults plus the extras above.
    stopwords = set(STOPWORDS)
    stopwords.update(EXTRA_STOPWORDS)

    # Generate the word cloud image from the argument (not a global variable).
    wordcloud = WordCloud(
        stopwords=stopwords,
        width=800,
        height=800,
        min_font_size=10,
        background_color="white",
        colormap="Set2",
        collocation_threshold=3,
    ).generate(new_text)

    fig = plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    return fig
# Render the word cloud for the combined post titles and keep the figure
# that generate_wordcloud returns.
text_cloud = generate_wordcloud(text)
from ipywidgets import interact
def wordcloud_year(selected_year):
    """Draw a word cloud of the titles of posts created in ``selected_year``.

    Reads the module-level ``post_df`` and filters it on ``created_year``.
    The cloud is shown with matplotlib; nothing is returned.
    """
    year_posts = post_df[post_df['created_year'] == selected_year]
    # Join with a space so words from neighbouring titles are not fused.
    text = ' '.join(str(p) for p in year_posts['post_title'])

    # Create stopwords:
    stopwords = set(STOPWORDS)

    # Generate a word cloud image
    wordcloud = WordCloud(
        stopwords=stopwords,
        width=500,
        height=500,
        min_font_size=10,
        background_color="white",
        colormap="Set2",
        collocation_threshold=3,
    ).generate(text)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Offer every distinct creation year as a choice and redraw the cloud
# through wordcloud_year whenever the selection changes.
year_options = post_df['created_year'].unique()
interact(wordcloud_year, selected_year=year_options)
interactive(children=(Dropdown(description='selected_year', options=(2017, 2018, 2023, 2020, 2019, 2016, 2022,…
<function __main__.wordcloud_year(selected_year)>
# Extract destinations in post texts.
def find_destination(text_col, country_names=None):
    """Return the country names that are mentioned in ``text_col``.

    Parameters:
        text_col: iterable of post texts. Entries that are not strings
            (for example NaN for a post without a body) are ignored.
        country_names: optional iterable of names to look for. When omitted,
            the ``name`` of every entry in ``pycountry.countries`` is used.

    Returns:
        A list of the names that occur in the texts as whole words, in the
        order in which they appear in ``country_names``.
    """
    if country_names is None:
        country_names = [country.name for country in pycountry.countries]

    # Join with a space so text from two different posts can never fuse
    # into something that looks like a country name.
    text = " ".join(p for p in text_col if isinstance(p, str))

    countries = []
    for name in country_names:
        # Escape the name (some contain '.', ',' or parentheses) and require
        # non-word characters around it, so "Niger" does not match inside
        # "Nigeria" and "India" does not match inside "Indian".
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        if re.search(pattern, text):
            countries.append(name)
    return countries
# Collect the countries found in the post bodies and store them in a
# one-column DataFrame that later cells extend.
country_list = find_destination(post_df['post_text'])
map_data = pd.DataFrame({'country': country_list})
map_data.head()
| country | |
|---|---|
| 0 | Aruba |
| 1 | Afghanistan |
| 2 | Angola |
| 3 | Albania |
| 4 | Andorra |
# Generate country code based on country name
def code_finding(country):
    """Return the ``alpha_3`` code pycountry has for ``country``, or None.

    Parameters:
        country: country name, looked up with ``pycountry.countries.get``.

    Returns:
        The three-letter code as a string, or None when the name is unknown.
    """
    try:
        match = pycountry.countries.get(name=country)
    except LookupError:
        # Some pycountry versions raise KeyError for an unknown name.
        return None
    # Other versions return None for an unknown name instead of raising.
    return getattr(match, "alpha_3", None)
# Attach the three-letter country code that plotly uses to place each point.
map_data['country_code'] = map_data['country'].apply(code_finding)

# map destinations for solo travelers
fig = px.scatter_geo(map_data, locations='country_code', hover_name='country')
# The post total is taken from the data instead of being hard-coded.
fig.update_layout(
    title=f"Destinations chosen by solo travellers(total={len(post_df)} posts)"
)
fig.show()
# Replace missing post bodies with empty strings so that the regex search in
# find_words below always receives a str.
post_df.loc[:,'post_text']=post_df['post_text'].fillna('')
#find the occurrences of keywords of solo traveler segmentation.
def find_words(search_pattern, text_col):
    """Count case-insensitive regex matches of ``search_pattern`` in ``text_col``.

    Parameters:
        search_pattern: regular expression, matched with ``re.IGNORECASE``.
            It is used as given: no escaping or word boundaries are added.
        text_col: iterable of texts. Entries that are not strings (NaN, None)
            are skipped instead of raising ``TypeError``.

    Returns:
        Total number of non-overlapping matches over all texts.
    """
    # Compile once instead of handing the pattern to re for every text.
    pattern = re.compile(search_pattern, re.IGNORECASE)
    return sum(
        len(pattern.findall(text))
        for text in text_col
        if isinstance(text, str)
    )
# Count how often each detected country is mentioned in the post bodies.
search_patterns = map_data['country']
# find_words treats its first argument as a regular expression, so each name
# is escaped (some contain '.', ',' or parentheses) and wrapped in
# word-boundary lookarounds. Otherwise "Oman" would also count "woman" and
# "Romania", and "Niger" would also count "Nigeria".
total_count = [
    find_words(r"(?<!\w)" + re.escape(p) + r"(?!\w)", post_df['post_text'])
    for p in search_patterns
]
print(total_count)
map_data['count_country'] = total_count
map_data.head()
| country | country_code | count_country | |
|---|---|---|---|
| 0 | Aruba | ABW | 1 |
| 1 | Afghanistan | AFG | 7 |
| 2 | Angola | AGO | 2 |
| 3 | Albania | ALB | 20 |
| 4 | Andorra | AND | 1 |
# top 10 destinations for solo travelling
# reset_index returns a new frame, so its result has to be assigned; calling
# it as a bare statement leaves top_10_countries unchanged.
top_10_countries = (
    map_data.sort_values(by='count_country', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Horizontal bar chart of the ten countries by mention count.
fig = px.bar(
    top_10_countries,
    x="count_country",
    y="country",
    labels={"count_country": "Count", "country": "Country "},
    text_auto=True,
    orientation='h',
    template="plotly_dark",
).update_yaxes(
    categoryorder="total ascending",
)
fig.update_layout(
    title="Top 10 destinations for solo travelling",
)
fig
# Allow up to 500 characters per column when pandas displays a frame.
pd.set_option('display.max_colwidth', 500)

# The ten posts with the highest score, title and score only.
score_columns = ['post_title', 'post_score']
ranked_by_score = post_df[score_columns].sort_values(by='post_score', ascending=False)
post_scores = ranked_by_score.head(10)
post_scores
| post_title | post_score | |
|---|---|---|
| 0 | My brother, Adeolu Ogunniyi (24) has been missing since September 10, 2017. He was backpacking in Central America and last seen at Laguna De Apoyo in Nicaragua. If you've seen him or heard anything PLEASE contact me. (more details in the description) | 34248 |
| 1 | How to shake someone at a hostel? | 8296 |
| 2 | I told myself I would go solo travel through Europe if I ever made it 6 months without a seizure. Today is my first day abroad :) | 7178 |
| 3 | The number of old sex tourists in Bangkok is insane | 5436 |
| 4 | For those of you still travelling Europe despite the restrictions, GO HOME. | 4777 |
| 5 | Solo travel means waking up early when you want. No crowds at the Treasury! | 4713 |
| 6 | Afraid to go back to my hostel room | 4636 |
| 7 | Solo travel at any age - my first time backpacking Europe (age 17) VS last week, 22 years later - Prague | 3553 |
| 8 | I just shat in my hostel bed, what to do next? | 3488 |
| 9 | You guys told me about Hostel Uppelink in Ghent, Belgium and the view that some of the rooms had, I still wasn't expecting it to be quite like this... | 3465 |
# The ten posts with the most comments, title and comment count only.
comment_columns = ['post_title', 'post_num_comments']
ranked_by_comments = post_df[comment_columns].sort_values(
    by='post_num_comments',
    ascending=False,
)
post_comments = ranked_by_comments.head(10)
post_comments
| post_title | post_num_comments | |
|---|---|---|
| 621 | What is a popular traveling spot that seems unappealing to you? | 1249 |
| 436 | What's a country you'd love to visit, but can't/won't? (non-pandemic reasons) | 995 |
| 279 | Top three favorite cities in the world? | 989 |
| 426 | What's one city that exceeded your expectations and one that left you a bit disappointed? | 962 |
| 255 | What is the worst poverty you have come across on your travels? | 949 |
| 808 | What city/place did you NOT feel safe in? | 868 |
| 458 | What are the tourist traps in your city that should be avoided? | 863 |
| 553 | Places you have visited and would not return? | 846 |
| 10 | An unfortunate reminder for other young female solo travelers | 775 |
| 343 | Rant about how Coronavirus ruined your trip | 764 |
# Count the posts created in each year; the result has the columns
# 'created_year' and 'count'.
posts_with_year = post_df[['post_id', 'created_year']]
post_by_year = (
    posts_with_year
    .groupby('created_year')
    .size()
    .reset_index(name='count')
)

# Line chart of the yearly post counts.
fig = px.line(
    post_by_year,
    x='created_year',
    y='count',
    title='Number of posts through the time(total posts=1000)',
)
fig.show()
# Scatter plot: post score (x) against number of comments (y).
fig = px.scatter(
    post_df,
    x='post_score',
    y='post_num_comments',
    title='Upvote number and comment number(total=1000 posts)',
)
fig.show()

# Scatter plot: upvote ratio (x) against number of comments (y).
fig = px.scatter(
    post_df,
    x='post_upvote_ratio',
    y='post_num_comments',
    title='Upvote ratio and comment number(total=1000 posts)',
)
fig.show()
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the
# last one, so the three results below are all shown.
InteractiveShell.ast_node_interactivity = "all"

upvote_ratio = post_df['post_upvote_ratio']
# Mean upvote ratio over all posts
upvote_ratio.mean()
# The ten largest upvote ratios
upvote_ratio.nlargest(10)
# The ten smallest upvote ratios
upvote_ratio.nsmallest(10)
0.9476600000000102
15 0.99 19 0.99 20 0.99 23 0.99 32 0.99 33 0.99 44 0.99 45 0.99 59 0.99 63 0.99 Name: post_upvote_ratio, dtype: float64
268 0.73 421 0.73 665 0.75 677 0.76 156 0.78 694 0.78 609 0.79 528 0.80 758 0.80 524 0.81 Name: post_upvote_ratio, dtype: float64